from IPython.core.display import HTML

# Notebook-wide CSS: center plot outputs and style headings/body text.
# Fixes: 'ariel' -> 'arial' (misspelled font falls back silently),
# removed 'horizontal-align' (not a CSS property, ignored by browsers),
# 'charcoal' is not a CSS named color -> use its hex value.
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
h1,h2 {
    text-align: center;
    background-color: black;
    padding: 20px;
    margin: 0;
    color: yellow;
    font-family: arial;
    border-radius: 80px;
}
h3 {
    text-align: center;
    border-style: solid;
    border-width: 3px;
    padding: 12px;
    margin: 0;
    color: black;
    font-family: arial;
    border-radius: 80px;
    border-color: gold;
}
body, p {
    font-family: arial;
    font-size: 15px;
    color: #36454f;  /* charcoal */
}
div {
    font-size: 14px;
    margin: 0;
}
h4 {
    padding: 0px;
    margin: 0;
    font-family: arial;
    color: purple;
}
</style>
""")
#pip install chart-studio
Note: you may need to restart the kernel to use updated packages.Collecting chart-studio
Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
-------------------------------------- 64.4/64.4 kB 694.6 kB/s eta 0:00:00
Collecting retrying>=1.3.3
Downloading retrying-1.3.4-py3-none-any.whl (11 kB)
Requirement already satisfied: six in c:\users\user\anaconda3\lib\site-packages (from chart-studio) (1.16.0)
Requirement already satisfied: requests in c:\users\user\anaconda3\lib\site-packages (from chart-studio) (2.28.1)
Requirement already satisfied: plotly in c:\users\user\anaconda3\lib\site-packages (from chart-studio) (5.9.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\user\anaconda3\lib\site-packages (from plotly->chart-studio) (8.0.1)
Requirement already satisfied: idna<4,>=2.5 in c:\users\user\anaconda3\lib\site-packages (from requests->chart-studio) (3.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\user\anaconda3\lib\site-packages (from requests->chart-studio) (2022.12.7)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\user\anaconda3\lib\site-packages (from requests->chart-studio) (2.0.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\user\anaconda3\lib\site-packages (from requests->chart-studio) (1.26.14)
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.4
# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# NOTE(review): `__future__` imports must normally be the first statement of a
# module/cell, and this one is a no-op on Python 3 — consider removing it.
from __future__ import division
from chart_studio import plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
#initiate visualization library for jupyter notebook
pyoff.init_notebook_mode()
# Load the Online Retail transactions dataset (one row per invoice line).
df = pd.read_csv('Online_Retail.csv')
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
| 5 | 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 08:26:00 | 7.65 | 17850.0 | United Kingdom |
| 6 | 536365 | 21730 | GLASS STAR FROSTED T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 4.25 | 17850.0 | United Kingdom |
| 7 | 536366 | 22633 | HAND WARMER UNION JACK | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 8 | 536366 | 22632 | HAND WARMER RED POLKA DOT | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom |
| 9 | 536367 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 32 | 2010-12-01 08:34:00 | 1.69 | 13047.0 | United Kingdom |
df.head(10)
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | Month_Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 | 2010-12 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
| 5 | 536365 | 22752 | SET 7 BABUSHKA NESTING BOXES | 2 | 2010-12-01 08:26:00 | 7.65 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 |
| 6 | 536365 | 21730 | GLASS STAR FROSTED T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 4.25 | 17850.0 | United Kingdom | 201012 | 25.50 | 2010-12 |
| 7 | 536366 | 22633 | HAND WARMER UNION JACK | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom | 201012 | 11.10 | 2010-12 |
| 8 | 536366 | 22632 | HAND WARMER RED POLKA DOT | 6 | 2010-12-01 08:28:00 | 1.85 | 17850.0 | United Kingdom | 201012 | 11.10 | 2010-12 |
| 9 | 536367 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 32 | 2010-12-01 08:34:00 | 1.69 | 13047.0 | United Kingdom | 201012 | 54.08 | 2010-12 |
We have all the crucial information we need:
Customer ID, Unit Price, Quantity, Invoice Date,
With all these features, we can build our North Star Metric equation: Revenue = Active Customer Count × Order Count × Average Revenue per Order
# Parse the InvoiceDate strings into real datetimes.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Monthly period column makes per-month reporting straightforward.
df['Month_Year'] = df['InvoiceDate'].dt.to_period('M')
# Line-level revenue = unit price times quantity.
df['Revenue'] = df['UnitPrice'] * df['Quantity']
# Aggregate revenue by month and render the period as e.g. "2010-Dec".
df_revenue = (
    df.groupby('Month_Year', as_index=False)['Revenue'].sum()
)
df_revenue['Month_Year'] = df_revenue['Month_Year'].dt.strftime('%Y-%b')
df_revenue
| Month_Year | Revenue | |
|---|---|---|
| 0 | 2010-Dec | 676742.620 |
| 1 | 2011-Jan | 434308.300 |
| 2 | 2011-Feb | 408247.910 |
| 3 | 2011-Mar | 559707.390 |
| 4 | 2011-Apr | 442254.041 |
| 5 | 2011-May | 596459.860 |
| 6 | 2011-Jun | 554478.350 |
| 7 | 2011-Jul | 565479.841 |
| 8 | 2011-Aug | 539130.500 |
| 9 | 2011-Sep | 862018.152 |
| 10 | 2011-Oct | 877438.190 |
| 11 | 2011-Nov | 1282805.780 |
| 12 | 2011-Dec | 388735.430 |
# X and Y axis inputs for the Plotly line graph (Scatter draws lines).
plot_data = [
    go.Scatter(
        x=df_revenue['Month_Year'],
        y=df_revenue['Revenue'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Revenue'  # fixed typo: was 'Montly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
This clearly shows our revenue is growing especially Aug ‘11 onwards (and our data in December is incomplete). Absolute numbers are fine, let’s figure out what is our Monthly Revenue Growth Rate:
# Month-over-month growth: pct_change() computes (current - previous) / previous,
# so the first row is NaN (no previous month).
df_revenue['Monthly_Growth'] = df_revenue['Revenue'].pct_change()
df_revenue
| Month_Year | Revenue | Monthly_Growth | |
|---|---|---|---|
| 0 | 2010-Dec | 676742.620 | NaN |
| 1 | 2011-Jan | 434308.300 | -0.358237 |
| 2 | 2011-Feb | 408247.910 | -0.060004 |
| 3 | 2011-Mar | 559707.390 | 0.370999 |
| 4 | 2011-Apr | 442254.041 | -0.209848 |
| 5 | 2011-May | 596459.860 | 0.348682 |
| 6 | 2011-Jun | 554478.350 | -0.070384 |
| 7 | 2011-Jul | 565479.841 | 0.019841 |
| 8 | 2011-Aug | 539130.500 | -0.046596 |
| 9 | 2011-Sep | 862018.152 | 0.598904 |
| 10 | 2011-Oct | 877438.190 | 0.017888 |
| 11 | 2011-Nov | 1282805.780 | 0.461990 |
| 12 | 2011-Dec | 388735.430 | -0.696965 |
# visualization - line graph of month-over-month growth.
# Fixes: removed stray `df_revenue["M"]`, which raises KeyError (no such
# column); sliced y to match x (both exclude the incomplete Dec '11 row);
# fixed 'Montly' typo in the title.
plot_data = [
    go.Scatter(
        x=df_revenue['Month_Year'].iloc[0:12],
        y=df_revenue['Monthly_Growth'].iloc[0:12],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Growth Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
We need to identify what exactly happened in April. Was it due to fewer active customers, or did our customers place fewer orders?
We can get the monthly active customers by counting unique CustomerIDs.
# Restrict the analysis to UK customers only.
df_uk = df[df['Country'] == 'United Kingdom'].reset_index(drop=True)
# Monthly active customers = count of distinct CustomerIDs seen each month.
df_monthly_active = (
    df_uk.groupby('Month_Year', as_index=False)['CustomerID'].nunique()
)
df_monthly_active['Month_Year'] = df_monthly_active['Month_Year'].dt.strftime('%Y-%b')
# print the dataframe
df_monthly_active
| Month_Year | CustomerID | |
|---|---|---|
| 0 | 2010-Dec | 871 |
| 1 | 2011-Jan | 684 |
| 2 | 2011-Feb | 714 |
| 3 | 2011-Mar | 923 |
| 4 | 2011-Apr | 817 |
| 5 | 2011-May | 985 |
| 6 | 2011-Jun | 943 |
| 7 | 2011-Jul | 899 |
| 8 | 2011-Aug | 867 |
| 9 | 2011-Sep | 1177 |
| 10 | 2011-Oct | 1285 |
| 11 | 2011-Nov | 1548 |
| 12 | 2011-Dec | 617 |
# Bar chart of the monthly active-customer counts computed above.
active_bar = go.Bar(
    x=df_monthly_active['Month_Year'],
    y=df_monthly_active['CustomerID'],
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Active Customers'
)
fig = go.Figure(data=[active_bar], layout=plot_layout)
pyoff.iplot(fig)
In April, Monthly Active Customer number dropped to 817 from 923 (-11.5%).
We will apply the same code by using Quantity field:
# Total quantity ordered per month (proxy for monthly order volume).
df_monthly_sales = (
    df_uk.groupby('Month_Year', as_index=False)['Quantity'].sum()
)
df_monthly_sales['Month_Year'] = df_monthly_sales['Month_Year'].dt.strftime('%Y-%b')
# print the dataframe
df_monthly_sales
# plot as a bar chart
plot_data = [
    go.Bar(
        x=df_monthly_sales['Month_Year'],
        y=df_monthly_sales['Quantity'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Total # of Order'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
As we expected, Order Count also declined in April (279k to 257k, -8%)
We know that the drop in Active Customer Count directly drove the decrease in Order Count.
To get this data, we need to calculate the average of revenue for each month:
# Average revenue per invoice line for each month.
df_monthly_order_avg = (
    df_uk.groupby('Month_Year', as_index=False)['Revenue'].mean()
)
df_monthly_order_avg['Month_Year'] = df_monthly_order_avg['Month_Year'].dt.strftime('%Y-%b')
# print the dataframe
df_monthly_order_avg
| Month_Year | Revenue | |
|---|---|---|
| 0 | 2010-Dec | 16.865860 |
| 1 | 2011-Jan | 13.614680 |
| 2 | 2011-Feb | 16.093027 |
| 3 | 2011-Mar | 16.716166 |
| 4 | 2011-Apr | 15.773380 |
| 5 | 2011-May | 17.713823 |
| 6 | 2011-Jun | 16.714748 |
| 7 | 2011-Jul | 15.723497 |
| 8 | 2011-Aug | 17.315899 |
| 9 | 2011-Sep | 18.931723 |
| 10 | 2011-Oct | 16.093582 |
| 11 | 2011-Nov | 16.312383 |
| 12 | 2011-Dec | 16.247406 |
# Bar chart of the monthly average revenue per invoice line.
avg_revenue_bar = go.Bar(
    x=df_monthly_order_avg['Month_Year'],
    y=df_monthly_order_avg['Revenue'],
)
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Order Average'
)
fig = go.Figure(data=[avg_revenue_bar], layout=plot_layout)
pyoff.iplot(fig)
New Customer Ratio: a good indicator of whether we are losing our existing customers or are unable to attract new ones
First we should define what is a new customer. In our dataset, we can assume a new customer is whoever did his/her first purchase in the time window we defined. We will do it monthly for this example.
We will be using .min() function to find our first purchase date for each customer and define new customers based on that.
# BUG FIX: the original derived this column from `df` (the full dataset),
# but df_uk was re-indexed after the UK filter, so index alignment attached
# the wrong rows' invoice dates. Derive it from df_uk itself.
df_uk['InvoiceYearMonth'] = df_uk['InvoiceDate'].map(lambda date: 100*date.year + date.month)
df_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | Month_Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 | 2010-12 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 |
# One row per customer: their first (earliest) purchase timestamp,
# plus that date encoded as a YYYYMM integer.
df_min_purchase = (
    df_uk.groupby('CustomerID')['InvoiceDate']
         .min()
         .reset_index()
         .rename(columns={'InvoiceDate': 'MinPurchaseDate'})
)
df_min_purchase['MinPurchaseYearMonth'] = df_min_purchase['MinPurchaseDate'].map(
    lambda d: 100 * d.year + d.month
)
df_min_purchase.head()
| CustomerID | MinPurchaseDate | MinPurchaseYearMonth | |
|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 201101 |
| 1 | 12747.0 | 2010-12-05 15:38:00 | 201012 |
| 2 | 12748.0 | 2010-12-01 12:48:00 | 201012 |
| 3 | 12749.0 | 2011-05-10 15:25:00 | 201105 |
| 4 | 12820.0 | 2011-01-17 12:34:00 | 201101 |
# Attach each customer's first-purchase info to every transaction row.
df_uk = df_uk.merge(df_min_purchase, on='CustomerID')
df_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | Month_Year | MinPurchaseDate | MinPurchaseYearMonth | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 | 2010-12-01 08:26:00 | 201012 |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 | 2010-12 | 2010-12-01 08:26:00 | 201012 |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 |
# Label each row: 'Existing' when the invoice month is strictly after the
# customer's first-purchase month, otherwise 'New' (first month counts as New).
df_uk['UserType'] = np.where(
    df_uk['InvoiceYearMonth'] > df_uk['MinPurchaseYearMonth'],
    'Existing',
    'New',
)
df_uk.UserType.value_counts()
Existing 256114 New 105764 Name: UserType, dtype: int64
df_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | Month_Year | MinPurchaseDate | MinPurchaseYearMonth | UserType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
# Monthly revenue split by user type; preview without the two partial months.
df_user_type_revenue = (
    df_uk.groupby(['InvoiceYearMonth', 'UserType'], as_index=False)['Revenue'].sum()
)
df_user_type_revenue.query("InvoiceYearMonth != 201012 and InvoiceYearMonth != 201112")
| InvoiceYearMonth | UserType | Revenue | |
|---|---|---|---|
| 1 | 201101 | Existing | 195275.510 |
| 2 | 201101 | New | 156705.770 |
| 3 | 201102 | Existing | 220994.630 |
| 4 | 201102 | New | 127859.000 |
| 5 | 201103 | Existing | 296350.030 |
| 6 | 201103 | New | 160567.840 |
| 7 | 201104 | Existing | 268226.660 |
| 8 | 201104 | New | 108517.751 |
| 9 | 201105 | Existing | 434725.860 |
| 10 | 201105 | New | 90847.490 |
| 11 | 201106 | Existing | 408030.060 |
| 12 | 201106 | New | 64479.190 |
| 13 | 201107 | Existing | 407693.610 |
| 14 | 201107 | New | 53453.991 |
| 15 | 201108 | Existing | 421388.930 |
| 16 | 201108 | New | 55619.480 |
| 17 | 201109 | Existing | 640861.901 |
| 18 | 201109 | New | 135667.941 |
| 19 | 201110 | Existing | 648837.600 |
| 20 | 201110 | New | 133940.280 |
| 21 | 201111 | Existing | 838955.910 |
| 22 | 201111 | New | 117153.750 |
# Drop the partial first/last months, then plot Existing vs New revenue.
df_user_type_revenue = df_user_type_revenue.query(
    "InvoiceYearMonth != 201012 and InvoiceYearMonth != 201112"
)
plot_data = []
for user_type in ('Existing', 'New'):
    subset = df_user_type_revenue[df_user_type_revenue['UserType'] == user_type]
    plot_data.append(
        go.Scatter(
            x=subset['InvoiceYearMonth'],
            y=subset['Revenue'],
            name=user_type,
        )
    )
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='New vs Existing'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# New-to-existing customer ratio per month; NaN rows dropped (e.g. the
# first month, which has no Existing customers yet).
new_monthly = df_uk.query("UserType == 'New'").groupby(['InvoiceYearMonth'])['CustomerID'].nunique()
existing_monthly = df_uk.query("UserType == 'Existing'").groupby(['InvoiceYearMonth'])['CustomerID'].nunique()
df_user_ratio = (new_monthly / existing_monthly).reset_index()
df_user_ratio = df_user_ratio.dropna()
# Show the monthly new-customer counts for reference.
new_monthly
InvoiceYearMonth 201012 871 201101 362 201102 339 201103 408 201104 276 201105 252 201106 207 201107 172 201108 140 201109 275 201110 318 201111 296 201112 34 Name: CustomerID, dtype: int64
df_uk.query("UserType == 'Existing'").groupby(['InvoiceYearMonth'])['CustomerID'].nunique()
InvoiceYearMonth 201101 322 201102 375 201103 515 201104 541 201105 733 201106 736 201107 727 201108 727 201109 902 201110 967 201111 1252 201112 583 Name: CustomerID, dtype: int64
# New customer ratio, restricted to full months (Feb–Nov 2011).
ratio_window = df_user_ratio.query("InvoiceYearMonth>201101 and InvoiceYearMonth<201112")
plot_data = [
    go.Bar(
        x=ratio_window['InvoiceYearMonth'],
        y=ratio_window['CustomerID'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='New Customer Ratio'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Signup Data
df_min_purchase.head()
| CustomerID | MinPurchaseDate | MinPurchaseYearMonth | |
|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 201101 |
| 1 | 12747.0 | 2010-12-05 15:38:00 | 201012 |
| 2 | 12748.0 | 2010-12-01 12:48:00 | 201012 |
| 3 | 12749.0 | 2011-05-10 15:25:00 | 201105 |
| 4 | 12820.0 | 2011-01-17 12:34:00 | 201101 |
# Distinct first-purchase months (YYYYMM ints) — the candidate pool for the
# simulated signup/install dates generated below.
unq_month_year = df_min_purchase.MinPurchaseYearMonth.unique()
def generate_signup_date(year_month, candidates=None):
    """Return a random year-month (YYYYMM int) at or before *year_month*.

    Used to simulate signup/install months, which the dataset lacks.

    Parameters
    ----------
    year_month : int
        Upper bound, encoded as YYYYMM (e.g. 201105).
    candidates : iterable of int, optional
        Pool of YYYYMM values to draw from. Defaults to the module-level
        ``unq_month_year`` (observed first-purchase months), keeping the
        original call signature backward-compatible.

    Returns
    -------
    int
        A randomly chosen eligible month (non-deterministic: uses
        ``np.random.choice``).
    """
    pool = unq_month_year if candidates is None else candidates
    eligible = [ym for ym in pool if ym <= year_month]
    return np.random.choice(eligible)
# Simulate a signup month at or before the first purchase, then an install
# month at or before signup (element-wise map; same call order as before).
df_min_purchase['SignupYearMonth'] = df_min_purchase['MinPurchaseYearMonth'].map(generate_signup_date)
df_min_purchase['InstallYearMonth'] = df_min_purchase['SignupYearMonth'].map(generate_signup_date)
df_min_purchase.head()
| CustomerID | MinPurchaseDate | MinPurchaseYearMonth | SignupYearMonth | InstallYearMonth | |
|---|---|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 201101 | 201012 | 201012 |
| 1 | 12747.0 | 2010-12-05 15:38:00 | 201012 | 201012 | 201012 |
| 2 | 12748.0 | 2010-12-01 12:48:00 | 201012 | 201012 | 201012 |
| 3 | 12749.0 | 2011-05-10 15:25:00 | 201105 | 201103 | 201102 |
| 4 | 12820.0 | 2011-01-17 12:34:00 | 201101 | 201012 | 201012 |
# Randomly assign a (simulated) acquisition channel to each customer.
channels = ['organic','inorganic','referral']
df_min_purchase['AcqChannel'] = df_min_purchase.apply(
    lambda _: np.random.choice(channels), axis=1
)
# Activation rate = customers who purchased in their signup month,
# divided by all customers who signed up that month.
activated = df_min_purchase[
    df_min_purchase['MinPurchaseYearMonth'] == df_min_purchase['SignupYearMonth']
]
df_activation = (
    activated.groupby('SignupYearMonth').CustomerID.count()
    / df_min_purchase.groupby('SignupYearMonth').CustomerID.count()
)
df_activation = df_activation.reset_index()
# Monthly activation rate, restricted to full months (Feb–Aug 2011).
activation_window = df_activation.query("SignupYearMonth>201101 and SignupYearMonth<201109")
plot_data = [
    go.Bar(
        x=activation_window['SignupYearMonth'],
        y=activation_window['CustomerID'],
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Activation Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Channel-level activation rate: same ratio as above, grouped by channel too.
signup_month_buyers = df_min_purchase[
    df_min_purchase['MinPurchaseYearMonth'] == df_min_purchase['SignupYearMonth']
]
df_activation_ch = (
    signup_month_buyers.groupby(['SignupYearMonth','AcqChannel']).CustomerID.count()
    / df_min_purchase.groupby(['SignupYearMonth','AcqChannel']).CustomerID.count()
)
df_activation_ch = df_activation_ch.reset_index()
# One line per acquisition channel, restricted to full months.
window = "SignupYearMonth>201101 and SignupYearMonth<201108"
plot_data = []
for channel in ('organic', 'inorganic', 'referral'):
    subset = df_activation_ch.query(window + " and AcqChannel == @channel")
    plot_data.append(
        go.Scatter(
            x=subset['SignupYearMonth'],
            y=subset['CustomerID'],
            name=channel,
        )
    )
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Activation Rate - Channel Based'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Retention Rate: King of the metrics. Indicates how many customers we retain over specific time window. We will be showing examples for monthly retention rate and cohort based retention rate. Retention rate should be monitored very closely because it indicates how sticky is your service and how well your product fits the market. For making Monthly Retention Rate visualized, we need to calculate how many customers retained from previous month.
Monthly Retention Rate = Retained Customers From Prev. Month/Active Customers Total
We will be using crosstab() function of pandas which makes calculating Retention Rate super easy.
df_uk.head()
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | InvoiceYearMonth | Revenue | Month_Year | MinPurchaseDate | MinPurchaseYearMonth | UserType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom | 201012 | 15.30 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom | 201012 | 22.00 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
| 4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom | 201012 | 20.34 | 2010-12 | 2010-12-01 08:26:00 | 201012 | New |
# Distinct customers per month (YYYYMM key this time).
df_monthly_active = df_uk.groupby('InvoiceYearMonth', as_index=False)['CustomerID'].nunique()
# Per-customer, per-month revenue (truncated to int) — the base table
# for the retention analysis below.
df_user_purchase = (
    df_uk.groupby(['CustomerID','InvoiceYearMonth'])['Revenue']
         .sum()
         .astype(int)
         .reset_index()
)
df_user_purchase
| CustomerID | InvoiceYearMonth | Revenue | |
|---|---|---|---|
| 0 | 12346.0 | 201101 | 0 |
| 1 | 12747.0 | 201012 | 706 |
| 2 | 12747.0 | 201101 | 303 |
| 3 | 12747.0 | 201103 | 310 |
| 4 | 12747.0 | 201105 | 771 |
| ... | ... | ... | ... |
| 12325 | 18283.0 | 201110 | 114 |
| 12326 | 18283.0 | 201111 | 651 |
| 12327 | 18283.0 | 201112 | 208 |
| 12328 | 18287.0 | 201105 | 765 |
| 12329 | 18287.0 | 201110 | 1072 |
12330 rows × 3 columns
df_user_purchase.Revenue.sum()
6762581
# Customer x month indicator matrix: since df_user_purchase has exactly one
# row per (customer, month) pair, the crosstab counts are 0/1 flags.
df_retention = pd.crosstab(
    df_user_purchase['CustomerID'],
    df_user_purchase['InvoiceYearMonth'],
).reset_index()
df_retention.head()
| InvoiceYearMonth | CustomerID | 201012 | 201101 | 201102 | 201103 | 201104 | 201105 | 201106 | 201107 | 201108 | 201109 | 201110 | 201111 | 201112 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12346.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 12747.0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| 2 | 12748.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 3 | 12749.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
| 4 | 12820.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
# Month columns only: columns[0] is CustomerID and columns[1] is 201012 —
# the first month, which has no previous month to measure retention against.
months = df_retention.columns[2:]
months
Index([201101, 201102, 201103, 201104, 201105, 201106, 201107, 201108, 201109,
201110, 201111, 201112],
dtype='object', name='InvoiceYearMonth')
# For each consecutive month pair, count how many of the month's active
# customers were also active the month before.
retention_array = []
for prev_month, curr_month in zip(months[:-1], months[1:]):
    active_this_month = df_retention[curr_month]
    retained = (active_this_month > 0) & (df_retention[prev_month] > 0)
    retention_array.append({
        'InvoiceYearMonth': int(curr_month),
        'TotalUserCount': active_this_month.sum(),
        'RetainedUserCount': df_retention.loc[retained, curr_month].sum(),
    })
df_retention = pd.DataFrame(retention_array)
df_retention.head()
| InvoiceYearMonth | TotalUserCount | RetainedUserCount | |
|---|---|---|---|
| 0 | 201102 | 714 | 263 |
| 1 | 201103 | 923 | 305 |
| 2 | 201104 | 817 | 310 |
| 3 | 201105 | 985 | 369 |
| 4 | 201106 | 943 | 417 |
# Share of each month's active customers who were also active the month before.
df_retention['RetentionRate'] = (
    df_retention['RetainedUserCount'] / df_retention['TotalUserCount']
)
df_retention
| InvoiceYearMonth | TotalUserCount | RetainedUserCount | RetentionRate | |
|---|---|---|---|---|
| 0 | 201102 | 714 | 263 | 0.368347 |
| 1 | 201103 | 923 | 305 | 0.330444 |
| 2 | 201104 | 817 | 310 | 0.379437 |
| 3 | 201105 | 985 | 369 | 0.374619 |
| 4 | 201106 | 943 | 417 | 0.442206 |
| 5 | 201107 | 899 | 379 | 0.421580 |
| 6 | 201108 | 867 | 391 | 0.450980 |
| 7 | 201109 | 1177 | 417 | 0.354291 |
| 8 | 201110 | 1285 | 502 | 0.390661 |
| 9 | 201111 | 1548 | 616 | 0.397933 |
| 10 | 201112 | 617 | 402 | 0.651540 |
# Monthly retention rate, excluding the incomplete Dec '11 month.
plot_data = [
    go.Scatter(
        x=df_retention.query("InvoiceYearMonth<201112")['InvoiceYearMonth'],
        y=df_retention.query("InvoiceYearMonth<201112")['RetentionRate'],
        # fixed: trace was mislabeled "organic" (copy-paste leftover from
        # the channel-based chart above)
        name="retention rate"
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Retention Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Monthly Retention Rate significantly jumped from June to August and went back to previous levels afterwards.
# Churn is the complement of retention.
df_retention['ChurnRate'] = 1 - df_retention['RetentionRate']
plot_data = [
    go.Scatter(
        x=df_retention.query("InvoiceYearMonth<201112")['InvoiceYearMonth'],
        y=df_retention.query("InvoiceYearMonth<201112")['ChurnRate'],
        # fixed: trace was mislabeled "organic" (copy-paste leftover from
        # the channel-based chart above)
        name="churn rate"
    )
]
plot_layout = go.Layout(
    xaxis={"type": "category"},
    title='Monthly Churn Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
There is another way of measuring Retention Rate which allows you to see Retention Rate for each cohort. Cohorts are determined as first purchase year-month of the customers. We will be measuring what percentage of the customers retained after their first purchase in each month. This view will help us to see how recent and old cohorts differ regarding retention rate and if recent changes in customer experience affected new customer’s retention or not.
df_user_purchase.head()
| CustomerID | InvoiceYearMonth | Revenue | |
|---|---|---|---|
| 0 | 12346.0 | 201101 | 0 |
| 1 | 12747.0 | 201012 | 706 |
| 2 | 12747.0 | 201101 | 303 |
| 3 | 12747.0 | 201103 | 310 |
| 4 | 12747.0 | 201105 | 771 |
df_min_purchase.head()
| CustomerID | MinPurchaseDate | MinPurchaseYearMonth | SignupYearMonth | InstallYearMonth | AcqChannel | |
|---|---|---|---|---|---|---|
| 0 | 12346.0 | 2011-01-18 10:01:00 | 201101 | 201012 | 201012 | referral |
| 1 | 12747.0 | 2010-12-05 15:38:00 | 201012 | 201012 | 201012 | referral |
| 2 | 12748.0 | 2010-12-01 12:48:00 | 201012 | 201012 | 201012 | inorganic |
| 3 | 12749.0 | 2011-05-10 15:25:00 | 201105 | 201103 | 201102 | organic |
| 4 | 12820.0 | 2011-01-17 12:34:00 | 201101 | 201012 | 201012 | organic |
# Rebuild the customer x month purchase matrix and attach each customer's
# cohort (first-purchase month).
df_retention = pd.crosstab(df_user_purchase['CustomerID'], df_user_purchase['InvoiceYearMonth']).reset_index()
df_retention = df_retention.merge(
    df_min_purchase[['CustomerID','MinPurchaseYearMonth']], on='CustomerID'
)
df_retention.head()
| CustomerID | 201012 | 201101 | 201102 | 201103 | 201104 | 201105 | 201106 | 201107 | 201108 | 201109 | 201110 | 201111 | 201112 | MinPurchaseYearMonth | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12346.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 201101 |
| 1 | 12747.0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 201012 |
| 2 | 12748.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 201012 |
| 3 | 12749.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 201105 |
| 4 | 12820.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 201101 |
# Prefix the month columns with 'm_' so they are valid identifiers
# usable inside DataFrame.query() below; keep the last column name as-is.
df_retention.columns = (
    ['m_' + str(col) for col in df_retention.columns[:-1]] + ['MinPurchaseYearMonth']
)
df_retention
| m_CustomerID | m_201012 | m_201101 | m_201102 | m_201103 | m_201104 | m_201105 | m_201106 | m_201107 | m_201108 | m_201109 | m_201110 | m_201111 | m_201112 | MinPurchaseYearMonth | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12346.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 201101 |
| 1 | 12747.0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 1 | 201012 |
| 2 | 12748.0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 201012 |
| 3 | 12749.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 201105 |
| 4 | 12820.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 201101 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3945 | 18280.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 201103 |
| 3946 | 18281.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 201106 |
| 3947 | 18282.0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 201108 |
| 3948 | 18283.0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 201101 |
| 3949 | 18287.0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 201105 |
3950 rows × 15 columns
# Build the cohort retention table: one row per cohort (first-purchase month),
# one column per month, values = share of the cohort active in that month.
retention_array = []
for i in range(len(months)):
    retention_data = {}
    selected_month = months[i]
    prev_months = months[:i]
    next_months = months[i+1:]
    # Months before the cohort's first purchase can have no activity: NaN.
    for prev_month in prev_months:
        retention_data[prev_month] = np.nan
    # Cohort size: customers whose first purchase fell in selected_month.
    total_user_count = df_retention[df_retention.MinPurchaseYearMonth == selected_month].MinPurchaseYearMonth.count()
    retention_data['TotalUserCount'] = total_user_count
    # By construction the whole cohort is active in its own month.
    retention_data[selected_month] = 1
    query = "MinPurchaseYearMonth == {}".format(selected_month)
    for next_month in next_months:
        # Count cohort members active in next_month via the m_YYYYMM 0/1
        # indicator columns, then normalize by cohort size (2 d.p.).
        new_query = query + " and {} > 0".format(str('m_' + str(next_month)))
        retention_data[next_month] = np.round(df_retention.query(new_query)['m_' + str(next_month)].sum()/total_user_count,2)
    retention_array.append(retention_data)
df_retention = pd.DataFrame(retention_array)
len(months)
12
# Label each cohort row by its first-purchase month.
df_retention.index = months
df_retention
| TotalUserCount | 201101 | 201102 | 201103 | 201104 | 201105 | 201106 | 201107 | 201108 | 201109 | 201110 | 201111 | 201112 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| InvoiceYearMonth | |||||||||||||
| 201101 | 362 | 1.0 | 0.23 | 0.28 | 0.25 | 0.34 | 0.29 | 0.26 | 0.26 | 0.31 | 0.35 | 0.36 | 0.15 |
| 201102 | 339 | NaN | 1.00 | 0.25 | 0.19 | 0.28 | 0.28 | 0.25 | 0.26 | 0.28 | 0.28 | 0.31 | 0.10 |
| 201103 | 408 | NaN | NaN | 1.00 | 0.19 | 0.26 | 0.22 | 0.23 | 0.17 | 0.26 | 0.24 | 0.29 | 0.09 |
| 201104 | 276 | NaN | NaN | NaN | 1.00 | 0.22 | 0.22 | 0.22 | 0.21 | 0.23 | 0.23 | 0.26 | 0.08 |
| 201105 | 252 | NaN | NaN | NaN | NaN | 1.00 | 0.23 | 0.17 | 0.17 | 0.21 | 0.24 | 0.27 | 0.10 |
| 201106 | 207 | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.21 | 0.16 | 0.25 | 0.26 | 0.32 | 0.10 |
| 201107 | 172 | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.20 | 0.19 | 0.23 | 0.28 | 0.11 |
| 201108 | 140 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.26 | 0.23 | 0.26 | 0.14 |
| 201109 | 275 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.29 | 0.33 | 0.12 |
| 201110 | 318 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.27 | 0.13 |
| 201111 | 296 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 | 0.14 |
| 201112 | 34 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.00 |
We can see that first month retention rate became better recently (don’t take Dec ’11 into account) and in almost 1 year, 15% of our customers retain with us.